/*******************************************************************************
* Copyright 2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/x64/jit_generator.hpp"

#include "cpu/x64/gemm/s8x8s32/common_u8.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

jit_sse41_kernel_b0_r_gemm_s8u8s32_kern::
        jit_sse41_kernel_b0_r_gemm_s8u8s32_kern()
    : jit_generator(nullptr, S8U8S32_COMPUTE_KERNEL_CODE_SIZE) {}

void jit_sse41_kernel_b0_r_gemm_s8u8s32_kern::generate() {

#ifndef _WIN32

#define M rdi
#define N rsi
#define K rdx
#define A r8
#define B r9
#define C r10
#define LDC r11

#define AA rcx
#define I r12
#define J r13
#define H rax
#define AO r14
#define BO r15
#define CO1 rbx
#define CO2 rbp

#else

#define M rcx
#define N rdx
#define K r8
#define A rsi
#define B r9
#define C r10
#define LDC r11

#define AA rdi
#define I r12
#define J r13
#define H rax
#define AO r14
#define BO r15
#define CO1 rbx
#define CO2 rbp

#endif

#ifdef _WIN32
#define ARG_A (args_offset - 16) + rsp
#define ARG_B (args_offset - 8) + rsp
#endif
#define ARG_C ((args_offset + 0) + rsp)
#define ARG_LDC ((args_offset + 8) + rsp)
#define ARG_COFFSET_R ((args_offset + 24) + rsp)

#define COFFSET_RX (16 + rsp)
#define COFFSET_RY (24 + rsp)

    inLocalLabel();
    {

        Xbyak::Label l100;
        Xbyak::Label l1038;
        Xbyak::Label l1048;
        Xbyak::Label l10dc;
        Xbyak::Label l1114;
        Xbyak::Label l116c;
        Xbyak::Label l11d4;
        Xbyak::Label l120c;
        Xbyak::Label l1210;
        Xbyak::Label l1250;
        Xbyak::Label l1278;
        Xbyak::Label l1304;
        Xbyak::Label l1318;
        Xbyak::Label l13a4;
        Xbyak::Label l13e0;
        Xbyak::Label l1434;
        Xbyak::Label l1494;
        Xbyak::Label l14e4;
        Xbyak::Label l1514;
        Xbyak::Label l1578;
        Xbyak::Label l1584;
        Xbyak::Label l15e8;
        Xbyak::Label l1610;
        Xbyak::Label l1650;
        Xbyak::Label l169c;
        Xbyak::Label l16c4;
        Xbyak::Label l16c8;
        Xbyak::Label l1708;
        Xbyak::Label l1730;
        Xbyak::Label l17bc;
        Xbyak::Label l17d0;
        Xbyak::Label l185c;
        Xbyak::Label l1898;
        Xbyak::Label l18ec;
        Xbyak::Label l194c;
        Xbyak::Label l1998;
        Xbyak::Label l19c8;
        Xbyak::Label l1a2c;
        Xbyak::Label l1a38;
        Xbyak::Label l1a9c;
        Xbyak::Label l1ac4;
        Xbyak::Label l1b04;
        Xbyak::Label l1b50;
        Xbyak::Label l1b78;
        Xbyak::Label l1b7c;
        Xbyak::Label l1bbc;
        Xbyak::Label l1be4;
        Xbyak::Label l1c70;
        Xbyak::Label l1c84;
        Xbyak::Label l1d10;
        Xbyak::Label l1d4c;
        Xbyak::Label l1da0;
        Xbyak::Label l1e00;
        Xbyak::Label l1e50;
        Xbyak::Label l1e80;
        Xbyak::Label l1ee4;
        Xbyak::Label l1ef0;
        Xbyak::Label l1f54;
        Xbyak::Label l1f7c;
        Xbyak::Label l1fbc;
        Xbyak::Label l2008;
        Xbyak::Label l2030;
        Xbyak::Label l2034;
        Xbyak::Label l280;
        Xbyak::Label l294;
        Xbyak::Label l414;
        Xbyak::Label l4b8;
        Xbyak::Label l58c;
        Xbyak::Label l67c;
        Xbyak::Label l728;
        Xbyak::Label l770;
        Xbyak::Label l864;
        Xbyak::Label l874;
        Xbyak::Label l968;
        Xbyak::Label l98;
        Xbyak::Label l9c4;
        Xbyak::Label la4c;
        Xbyak::Label laf4;
        Xbyak::Label lb48;
        Xbyak::Label lb5c;
        Xbyak::Label lb9c;
        Xbyak::Label lbc8;
        Xbyak::Label lc8;
        Xbyak::Label lca4;
        Xbyak::Label lcb8;
        Xbyak::Label ld94;
        Xbyak::Label ldf0;
        Xbyak::Label le6c;
        Xbyak::Label lefc;
        Xbyak::Label lf68;
        Xbyak::Label lfa4;

        auto stack_alloc_size = 32;
        auto args_offset = stack_alloc_size + get_size_of_abi_save_regs() + 8;
#ifdef _WIN32
        args_offset += 48;
#endif
        preamble();
        sub(rsp, stack_alloc_size);
#ifdef _WIN32
        mov(A, ptr[ARG_A]);
        mov(B, ptr[ARG_B]);
#endif

        mov(C, qword[ARG_C]);
        mov(LDC, qword[ARG_LDC]);
        sub(A, -128);
        sub(B, -128);
        mov(M, qword[M]);
        mov(N, qword[N]);
        mov(K, qword[K]);
        lea(LDC, ptr[LDC * 4 + 0x0]);
        mov(H, qword[ARG_COFFSET_R]);
        mov(qword[COFFSET_RX], H);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        xorps(xmm12, xmm12);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(H, 0x10001);
        movq(xmm7, H);
        pshufd(xmm7, xmm7, 0x0);
        mov(J, M);
        cmp(J, 0x10);
        jl(lb5c, T_NEAR);
        align(4);

        L(l98);
        mov(CO1, C);
        add(C, 0x40);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x20);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_RX]);
        mov(qword[COFFSET_RY], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l728, T_NEAR);
        align(4);

        L(lc8);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm2, xword[AO - 0x60]);
        movdqu(xmm3, xword[AO - 0x50]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l414, T_NEAR);
        sub(H, 0x8);
        jle(l280, T_NEAR);
        align(4);

        L(l100);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l100, T_NEAR);
        align(4);

        L(l280);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l414, T_NEAR);
        align(4);

        L(l294);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l294, T_NEAR);
        align(4);

        L(l414);
        mov(H, K);
        test(H, 0x4);
        je(l4b8, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x40);
        add(BO, 0x8);
        align(4);

        L(l4b8);
        mov(H, K);
        test(H, 0x2);
        je(l58c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movdqu(xmm2, xword[AO - 0x70]);
        movaps(xmm3, xmm2);
        punpcklwd(xmm2, xmm6);
        punpckhwd(xmm3, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x20);
        add(BO, 0x4);
        align(4);

        L(l58c);
        mov(H, K);
        test(H, 0x1);
        je(l67c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        pshufd(xmm2, xmm3, 0xaa);
        punpcklbw(xmm2, xmm6);
        punpcklwd(xmm2, xmm6);
        pshufd(xmm3, xmm3, 0xff);
        punpcklbw(xmm3, xmm6);
        punpcklwd(xmm3, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm11, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm13, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm15, xmm4);
        add(AO, 0x10);
        add(BO, 0x2);
        align(4);

        L(l67c);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        paddd(xmm10, xmm0);
        paddd(xmm12, xmm0);
        paddd(xmm14, xmm0);
        movd(xmm0, dword[H + 0x4]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm9, xmm0);
        paddd(xmm11, xmm0);
        paddd(xmm13, xmm0);
        paddd(xmm15, xmm0);
        add(qword[COFFSET_RY], 0x8);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + 0x20], xmm12);
        xorps(xmm12, xmm12);
        movdqu(xword[CO1 + 0x30], xmm14);
        xorps(xmm14, xmm14);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
        xorps(xmm11, xmm11);
        movdqu(xword[CO1 + LDC * 1 + 0x20], xmm13);
        xorps(xmm13, xmm13);
        movdqu(xword[CO1 + LDC * 1 + 0x30], xmm15);
        xorps(xmm15, xmm15);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(lc8, T_NEAR);
        align(4);

        L(l728);
        test(I, 0x1);
        jle(lb48, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm2, xword[AO - 0x60]);
        movdqu(xmm3, xword[AO - 0x50]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l968, T_NEAR);
        sub(H, 0x8);
        jle(l864, T_NEAR);
        align(4);

        L(l770);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l770, T_NEAR);
        align(4);

        L(l864);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l968, T_NEAR);
        align(4);

        L(l874);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        movdqu(xmm2, xword[AO - 0x20]);
        movdqu(xmm3, xword[AO - 0x10]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO]);
        movdqu(xmm1, xword[AO + 0x10]);
        movdqu(xmm2, xword[AO + 0x20]);
        movdqu(xmm3, xword[AO + 0x30]);
        add(AA, 0x4);
        add(AO, 0x80);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l874, T_NEAR);
        align(4);

        L(l968);
        mov(H, K);
        test(H, 0x4);
        je(l9c4, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x40);
        add(BO, 0x4);
        align(4);

        L(l9c4);
        mov(H, K);
        test(H, 0x2);
        je(la4c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movdqu(xmm2, xword[AO - 0x70]);
        movaps(xmm3, xmm2);
        punpcklwd(xmm2, xmm6);
        punpckhwd(xmm3, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x20);
        add(BO, 0x2);
        align(4);

        L(la4c);
        mov(H, K);
        test(H, 0x1);
        je(laf4, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        pshufd(xmm2, xmm3, 0xaa);
        punpcklbw(xmm2, xmm6);
        punpcklwd(xmm2, xmm6);
        pshufd(xmm3, xmm3, 0xff);
        punpcklbw(xmm3, xmm6);
        punpcklwd(xmm3, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm1);
        pmaddwd(xmm6, xmm7);
        paddd(xmm10, xmm6);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm2);
        pmaddwd(xmm6, xmm7);
        paddd(xmm12, xmm6);
        pmaddubsw(xmm4, xmm3);
        pmaddwd(xmm4, xmm7);
        paddd(xmm14, xmm4);
        add(AO, 0x10);
        add(BO, 0x1);
        align(4);

        L(laf4);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        paddd(xmm10, xmm0);
        paddd(xmm12, xmm0);
        paddd(xmm14, xmm0);
        add(qword[COFFSET_RY], 0x4);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + 0x20], xmm12);
        xorps(xmm12, xmm12);
        movdqu(xword[CO1 + 0x30], xmm14);
        xorps(xmm14, xmm14);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(lb48);
        mov(A, AO);
        sub(J, 0x10);
        cmp(J, 0x10);
        jge(l98, T_NEAR);
        align(4);

        L(lb5c);
        test(J, 0x8);
        jle(l1210, T_NEAR);
        mov(CO1, C);
        add(C, 0x20);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x10);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_RX]);
        mov(qword[COFFSET_RY], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(lf68, T_NEAR);
        align(4);

        L(lb9c);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(ld94, T_NEAR);
        sub(H, 0x8);
        jle(lca4, T_NEAR);
        align(4);

        L(lbc8);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(lbc8, T_NEAR);
        align(4);

        L(lca4);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(ld94, T_NEAR);
        align(4);

        L(lcb8);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(lcb8, T_NEAR);
        align(4);

        L(ld94);
        mov(H, K);
        test(H, 0x4);
        je(ldf0, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x20);
        add(BO, 0x8);
        align(4);

        L(ldf0);
        mov(H, K);
        test(H, 0x2);
        je(le6c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x10);
        add(BO, 0x4);
        align(4);

        L(le6c);
        mov(H, K);
        test(H, 0x1);
        je(lefc, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm9, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm11, xmm4);
        add(AO, 0x8);
        add(BO, 0x2);
        align(4);

        L(lefc);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        paddd(xmm10, xmm0);
        movd(xmm0, dword[H + 0x4]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm9, xmm0);
        paddd(xmm11, xmm0);
        add(qword[COFFSET_RY], 0x8);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        movdqu(xword[CO1 + LDC * 1 + 0x10], xmm11);
        xorps(xmm11, xmm11);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(lb9c, T_NEAR);
        align(4);

        L(lf68);
        test(I, 0x1);
        jle(l120c, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm1, xword[AO - 0x70]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l10dc, T_NEAR);
        sub(H, 0x8);
        jle(l1038, T_NEAR);
        align(4);

        L(lfa4);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(lfa4, T_NEAR);
        align(4);

        L(l1038);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l10dc, T_NEAR);
        align(4);

        L(l1048);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        prefetcht0(byte[AO + 0x180]);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x60]);
        movdqu(xmm1, xword[AO - 0x50]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x40]);
        movdqu(xmm1, xword[AO - 0x30]);
        add(AA, 0x4);
        add(AO, 0x40);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1048, T_NEAR);
        align(4);

        L(l10dc);
        mov(H, K);
        test(H, 0x4);
        je(l1114, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x20);
        add(BO, 0x4);
        align(4);

        L(l1114);
        mov(H, K);
        test(H, 0x2);
        je(l116c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        punpckhwd(xmm1, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x10);
        add(BO, 0x2);
        align(4);

        L(l116c);
        mov(H, K);
        test(H, 0x1);
        je(l11d4, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        pshufd(xmm1, xmm3, 0x55);
        punpcklbw(xmm1, xmm6);
        punpcklwd(xmm1, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        movaps(xmm6, xmm4);
        pmaddubsw(xmm6, xmm0);
        pmaddwd(xmm6, xmm7);
        paddd(xmm8, xmm6);
        pmaddubsw(xmm4, xmm1);
        pmaddwd(xmm4, xmm7);
        paddd(xmm10, xmm4);
        add(AO, 0x8);
        add(BO, 0x1);
        align(4);

        L(l11d4);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        paddd(xmm10, xmm0);
        add(qword[COFFSET_RY], 0x4);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + 0x10], xmm10);
        xorps(xmm10, xmm10);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l120c);
        mov(A, AO);
        align(4);

        L(l1210);
        test(J, 0x4);
        jle(l16c8, T_NEAR);
        mov(CO1, C);
        add(C, 0x10);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x8);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_RX]);
        mov(qword[COFFSET_RY], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l14e4, T_NEAR);
        align(4);

        L(l1250);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l13a4, T_NEAR);
        sub(H, 0x8);
        jle(l1304, T_NEAR);
        align(4);

        L(l1278);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1278, T_NEAR);
        align(4);

        L(l1304);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l13a4, T_NEAR);
        align(4);

        L(l1318);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1318, T_NEAR);
        align(4);

        L(l13a4);
        mov(H, K);
        test(H, 0x4);
        je(l13e0, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x10);
        add(BO, 0x8);
        align(4);

        L(l13e0);
        mov(H, K);
        test(H, 0x2);
        je(l1434, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x8);
        add(BO, 0x4);
        align(4);

        L(l1434);
        mov(H, K);
        test(H, 0x1);
        je(l1494, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x2);
        align(4);

        L(l1494);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        movd(xmm0, dword[H + 0x4]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm9, xmm0);
        add(qword[COFFSET_RY], 0x8);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movdqu(xword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l1250, T_NEAR);
        align(4);

        L(l14e4);
        test(I, 0x1);
        jle(l16c4, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l15e8, T_NEAR);
        sub(H, 0x8);
        jle(l1578, T_NEAR);
        align(4);

        L(l1514);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1514, T_NEAR);
        align(4);

        L(l1578);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l15e8, T_NEAR);
        align(4);

        L(l1584);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x70]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x60]);
        add(AA, 0x4);
        add(AO, 0x20);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1584, T_NEAR);
        align(4);

        L(l15e8);
        mov(H, K);
        test(H, 0x4);
        je(l1610, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x10);
        add(BO, 0x4);
        align(4);

        L(l1610);
        mov(H, K);
        test(H, 0x2);
        je(l1650, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x8);
        add(BO, 0x2);
        align(4);

        L(l1650);
        mov(H, K);
        test(H, 0x1);
        je(l169c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x1);
        align(4);

        L(l169c);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        add(qword[COFFSET_RY], 0x4);
        movdqu(xword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l16c4);
        mov(A, AO);
        align(4);

        L(l16c8);
        test(J, 0x2);
        jle(l1b7c, T_NEAR);
        mov(CO1, C);
        add(C, 0x8);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x4);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_RX]);
        mov(qword[COFFSET_RY], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l1998, T_NEAR);
        align(4);

        L(l1708);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l185c, T_NEAR);
        sub(H, 0x8);
        jle(l17bc, T_NEAR);
        align(4);

        L(l1730);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1730, T_NEAR);
        align(4);

        L(l17bc);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l185c, T_NEAR);
        align(4);

        L(l17d0);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l17d0, T_NEAR);
        align(4);

        L(l185c);
        mov(H, K);
        test(H, 0x4);
        je(l1898, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x8);
        add(BO, 0x8);
        align(4);

        L(l1898);
        mov(H, K);
        test(H, 0x2);
        je(l18ec, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x4);
        align(4);

        L(l18ec);
        mov(H, K);
        test(H, 0x1);
        je(l194c, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x2);
        add(BO, 0x2);
        align(4);

        L(l194c);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        movd(xmm0, dword[H + 0x4]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm9, xmm0);
        add(qword[COFFSET_RY], 0x8);
        movlps(qword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movlps(qword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l1708, T_NEAR);
        align(4);

        L(l1998);
        test(I, 0x1);
        jle(l1b78, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1a9c, T_NEAR);
        sub(H, 0x8);
        jle(l1a2c, T_NEAR);
        align(4);

        L(l19c8);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l19c8, T_NEAR);
        align(4);

        L(l1a2c);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l1a9c, T_NEAR);
        align(4);

        L(l1a38);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x78]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x70]);
        add(AA, 0x4);
        add(AO, 0x10);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1a38, T_NEAR);
        align(4);

        L(l1a9c);
        mov(H, K);
        test(H, 0x4);
        je(l1ac4, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x8);
        add(BO, 0x4);
        align(4);

        L(l1ac4);
        mov(H, K);
        test(H, 0x2);
        je(l1b04, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x2);
        align(4);

        L(l1b04);
        mov(H, K);
        test(H, 0x1);
        je(l1b50, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x2);
        add(BO, 0x1);
        align(4);

        L(l1b50);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        add(qword[COFFSET_RY], 0x4);
        movlps(qword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l1b78);
        mov(A, AO);
        align(4);

        L(l1b7c);
        test(J, 0x1);
        jle(l2034, T_NEAR);
        mov(CO1, C);
        add(C, 0x4);
        mov(BO, B);
        mov(AA, K);
        shl(AA, 0x2);
        lea(AA, ptr[A + AA * 1 + 0x200]);
        mov(H, qword[COFFSET_RX]);
        mov(qword[COFFSET_RY], H);
        mov(I, N);
        cmp(I, 0x2);
        jl(l1e50, T_NEAR);
        align(4);

        L(l1bbc);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1d10, T_NEAR);
        sub(H, 0x8);
        jle(l1c70, T_NEAR);
        align(4);

        L(l1be4);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1be4, T_NEAR);
        align(4);

        L(l1c70);
        prefetcht0(byte[CO1 + 0x3c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x3c]);
        add(H, 0x8);
        jle(l1d10, T_NEAR);
        align(4);

        L(l1c84);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0xaa);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0xff);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        movdqu(xmm5, xword[BO - 0x70]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x10);
        sub(H, 0x1);
        jg(l1c84, T_NEAR);
        align(4);

        L(l1d10);
        mov(H, K);
        test(H, 0x4);
        je(l1d4c, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x4);
        add(BO, 0x8);
        align(4);

        L(l1d4c);
        mov(H, K);
        test(H, 0x2);
        je(l1da0, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x2);
        add(BO, 0x4);
        align(4);

        L(l1da0);
        mov(H, K);
        test(H, 0x1);
        je(l1e00, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm9, xmm4);
        add(AO, 0x1);
        add(BO, 0x2);
        align(4);

        L(l1e00);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        movd(xmm0, dword[H + 0x4]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm9, xmm0);
        add(qword[COFFSET_RY], 0x8);
        movss(dword[CO1], xmm8);
        xorps(xmm8, xmm8);
        movss(dword[CO1 + LDC * 1], xmm9);
        xorps(xmm9, xmm9);
        lea(CO1, ptr[CO1 + LDC * 2]);
        sub(I, 0x2);
        cmp(I, 0x2);
        jge(l1bbc, T_NEAR);
        align(4);

        L(l1e50);
        test(I, 0x1);
        jle(l2030, T_NEAR);
        mov(AO, A);
        movdqu(xmm0, xword[AO - 0x80]);
        movdqu(xmm5, xword[BO - 0x80]);
        mov(H, K);
        sar(H, 0x3);
        jle(l1f54, T_NEAR);
        sub(H, 0x8);
        jle(l1ee4, T_NEAR);
        align(4);

        L(l1e80);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1e80, T_NEAR);
        align(4);

        L(l1ee4);
        prefetcht0(byte[CO1 + 0x3c]);
        add(H, 0x8);
        jle(l1f54, T_NEAR);
        align(4);

        L(l1ef0);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO]);
        movdqu(xmm0, xword[AO - 0x7c]);
        prefetcht0(byte[AO + 0x1c0]);
        pshufd(xmm4, xmm5, 0x55);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        movdqu(xmm5, xword[BO - 0x78]);
        prefetcht1(byte[AA - 0x80]);
        movdqu(xmm0, xword[AO - 0x78]);
        add(AA, 0x4);
        add(AO, 0x8);
        add(BO, 0x8);
        sub(H, 0x1);
        jg(l1ef0, T_NEAR);
        align(4);

        L(l1f54);
        mov(H, K);
        test(H, 0x4);
        je(l1f7c, T_NEAR);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x4);
        add(BO, 0x4);
        align(4);

        L(l1f7c);
        mov(H, K);
        test(H, 0x2);
        je(l1fbc, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm0, xword[AO - 0x80]);
        movaps(xmm1, xmm0);
        punpcklwd(xmm0, xmm6);
        movss(xmm5, dword[BO - 0x80]);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x2);
        add(BO, 0x2);
        align(4);

        L(l1fbc);
        mov(H, K);
        test(H, 0x1);
        je(l2008, T_NEAR);
        xorps(xmm6, xmm6);
        movdqu(xmm3, xword[AO - 0x80]);
        pshufd(xmm0, xmm3, 0x0);
        punpcklbw(xmm0, xmm6);
        punpcklwd(xmm0, xmm6);
        movd(xmm5, dword[BO - 0x80]);
        punpcklbw(xmm5, xmm5);
        punpcklwd(xmm5, xmm5);
        pshufd(xmm4, xmm5, 0x0);
        pmaddubsw(xmm4, xmm0);
        pmaddwd(xmm4, xmm7);
        paddd(xmm8, xmm4);
        add(AO, 0x1);
        add(BO, 0x1);
        align(4);

        L(l2008);
        mov(H, qword[COFFSET_RY]);
        movd(xmm0, dword[H]);
        pshufd(xmm0, xmm0, 0x0);
        paddd(xmm8, xmm0);
        add(qword[COFFSET_RY], 0x4);
        movss(dword[CO1], xmm8);
        xorps(xmm8, xmm8);
        lea(CO1, ptr[CO1 + LDC * 1]);
        align(4);

        L(l2030);
        mov(A, AO);
        align(4);

        L(l2034);
        add(rsp, stack_alloc_size);
        postamble();
    }
    outLocalLabel();

#undef M
#undef N
#undef K
#undef A
#undef B
#undef C
#undef LDC
#undef AA
#undef I
#undef J
#undef H
#undef AO
#undef BO
#undef CO1
#undef CO2
#ifdef _WIN32
#undef ARG_A
#undef ARG_B
#endif
#undef ARG_C
#undef ARG_LDC
#undef ARG_COFFSET_R
#undef COFFSET_RX
#undef COFFSET_RY
}

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl
